import requests
import re
import time
import random
import pymysql

class DyttSpider:
    """Crawl the dytt8.net "latest movies" listing pages and store each
    (detail-page link, title) pair as a row in the MySQL table ``dytt_tab``."""

    def __init__(self):
        # Page-number template for the listing pages (filled by crawl()).
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
        # Compile the extraction pattern once: it is reused for every page
        # instead of being recompiled on each parse_html() call.
        self.regex = re.compile(
            '<td height="26">.*?<a href="(.*?)" class="ulink">(.*?)</a>',
            re.S,
        )
        # Database connection and cursor, shared for the whole crawl.
        self.db = pymysql.connect(
            host='localhost', user='waqwb', password='qiu55555',
            database='dyttdb', charset='utf8'
        )
        self.cur = self.db.cursor()

    def get_html(self, url):
        """Fetch one listing page and hand the decoded HTML to the parser.

        The site serves GBK-encoded pages; bytes that cannot be decoded
        are ignored rather than raising UnicodeDecodeError.
        """
        # timeout keeps one stuck request from hanging the entire crawl
        response = requests.get(url=url, headers=self.headers, timeout=10)
        html = response.content.decode('gbk', 'ignore')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (link, title) pairs from one page and insert them into MySQL."""
        r_list = self.regex.findall(html)
        if not r_list:
            # Nothing matched (layout change or empty page) — skip the insert.
            return
        ins = 'insert into dytt_tab values(%s,%s)'
        # Batch-insert the whole page and commit once, instead of a
        # per-row execute + commit round trip.
        self.cur.executemany(ins, r_list)
        self.db.commit()
        for r in r_list:
            print(r)

    def crawl(self):
        """Entry point: crawl listing pages 1-100, then release DB resources."""
        try:
            for page in range(1, 101):
                page_url = self.url.format(page)
                self.get_html(url=page_url)
                # Random 1-3 s delay to throttle the request rate.
                time.sleep(random.randint(1, 3))
        finally:
            # Close cursor and connection even if a request or insert fails.
            self.cur.close()
            self.db.close()

if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    DyttSpider().crawl()